
#######################################################################################################
#                                 Portfolio Data Processing for Login Days
#
# Portfolio Data on Login Days:
# -----------------------------
# The portfolio data has been compiled into a long panel dataset that includes portfolio information 
# for every 'anon' account on all login days, including transaction days. In the processed data, 
# each observation uniquely represents a combination of a stock, date, and account.
#
# Data Storage:
# -------------
# To manage file size, the data is stored in two files (as explained in "cleaning_data1.R"):
# 1. "310319sell_day_portfolios_newacc_BIG_1_to_10000.csv"
# 2. "310319sell_day_portfolios_newacc_BIG_10000_to_more.csv"
#
# This script will merge these two datasets and compute necessary variables, including returns 
# since since purchase, for the regression analyses featured in the paper.
#
# Output Files:
# -------------
# - Cleaned data where each observation represents an account, stock, and date:
#   - `data_peak_prices_cleaned.csv`
#
# - Cross-sectional data where each observation represents an account:
#   - `data_peak_cross_sec_acc.csv`
#
# - Peak prices data where the peak is the highest price each investor has experienced since the purchase:
#   - `MAXC_peak_5update.csv` (using a minimum of 5 business days to update the peak)
#   - `MAXC_peak_20update.csv` (using a minimum of 20 business days to update the peak)
#
# - Peak prices determined within a one-year window:
#   - `peak_past_year_5update.csv` (using a minimum of 5 business days to update the peak)
#   - `peak_past_year_20update.csv` (using a minimum of 20 business days to update the peak)
#
#######################################################################################################



rm(list = ls(all = TRUE))
library(data.table)
library(ggplot2)
library(devtools)
library(stargazer)
library(statar)
library(stringi)
library(xtable)
library(lfe)
library(dplyr)
library(lubridate)



memory.limit(size = 10000000000000000000)
memory.limit()
Sys.setenv(LANG = "en")
Sys.setlocale("LC_TIME", "us")

wd_peak <- "D:/Peak_prices_19_data/"

# Reading data

data <- fread("D:/settings/beset/Desktop/output_new_acc/new_acc_only_sell_days/310319sell_day_portfolios_newacc_BIG_1_to_10000.csv")
data2 <- fread("D:/settings/beset/Desktop/output_new_acc/new_acc_only_sell_days/310319sell_day_portfolios_newacc_BIG_10000_to_more.csv")

# The data includes the following variables:
# port.date: The current date.
# last_td: The last recorded transaction date for a particular SEDOL (stock).
# pur_day: The purchase date.
# prior.posit: The stock position before the last transaction date.
# net.posit: The stock position on the given date.
# qwapp: The purchase price of the stock at the end of the day.
# qwapp_bef: The purchase price of the stock before the last transaction.
# Code_used_DS: The unique Datastream identifier for the security in the dataset.
# sell: A dummy variable indicating a sell transaction.
# buy: A dummy variable indicating a buy transaction.
# a_login: A dummy variable indicating a login on the day.
# narrative: Descriptive text related to the last recorded transaction. Examples include ‘BUY’ for purchase,
# ‘SELL’ for sale, ‘MOVEMENT ON’ for other adjustments, and ‘CONSOL’ for consolidation of holdings.


data<- rbind(data, data2)
data2<-NULL

data[,last_td:=as.Date(last_td, "%Y-%m-%d")]
data[,port_date:=as.Date(port_date, "%Y-%m-%d")]
data[, pur_day           :=as.Date(pur_day , "%Y-%m-%d")]
data[,.(narrative, anon, port_date,sedol, net.posit, prior.posit, qwapp, qwapp_bef, last_td, quantity)]

# Creating two variables:
# qwapp_yest: The purchase price of the stock on the previous day.
# posit_yest: The stock position on the previous day.

data[,posit_yest:=ifelse(port_date==last_td,  prior.posit,  net.posit)]    

data[,qwapp_yest:=ifelse(port_date==last_td & posit_yest==0, NA,        
                         ifelse(port_date==last_td & posit_yest!=0, qwapp_bef, 
                                qwapp))]                                       

# Code_used_DS is the unique Datastream identifier for each SEDOL in the dataset.
# Deleting accounts with duplicate observations caused by SEDOLs undergoing name changes, 
# as these accounts cannot be accurately matched with price data.

delete_accounts_change_name_sedols<- unique(data[!is.na(Code_used_DS) ,.N, by=.(anon, port_date, Code_used_DS)][N>1,.(anon)])

data <- data[!anon  %in% delete_accounts_change_name_sedols$anon]

data[, ":="(sedol=NULL, isin=NULL, name=NULL, anon_date=NULL, 
            af=NULL, s.nearest.td=NULL, nearest.td.overal=NULL,       
            s.nearest.td.overal=NULL, app=NULL)]  

names(data)

# Transactions can only occur when there is a login on the day. 

data[, raw_sell:=sell]
data[, raw_buy:=buy]
data[sell==1 & a_login==0, sell:=0]
data[buy==1 & a_login==0, buy:=0]

# Flagging positions that were just opened, where the position on the previous day was zero.
# This indicates that the stock was purchased today. There is no available return data for newly purchased sedols.

data[, positions_no_opened_on_sell_day:=0]
data[posit_yest>0, positions_no_opened_on_sell_day:=1] 

# Reading prices retrieved from Datastream

dd <- fread("D:/files_moved_19/output datastream/datastream data/310518_prices/datastream_20180601.csv")
dd[, date := as.Date(date, "%Y-%m-%d")]
dd <- dd[order(Code_used_DS, date)]

dd[, app_yest := shift(app, 1, NA, "lag"), by = .(Code_used_DS)]
dd[, upp_yest := shift(upp, 1, NA, "lag"), by = .(Code_used_DS)]

dd <- dd[, .(app, app_yest, upp, upp_yest, af, date, Code_used_DS)]
dd <- unique(dd)
dd[, port_date := date]

# Removing stocks with no matching prices

data <- data[!is.na(Code_used_DS)]
data <- merge(data, dd, by = c("Code_used_DS", "port_date"), all.x = TRUE)
data <- data[!is.na(app)]

# day of the week

data.and.login.date.m <- data[order(anon, Code_used_DS, port_date)]
data.and.login.date.m[, dow := wday(port_date, label = TRUE)]
data <- NULL

# As part of the account-level statistics, we include the percentage of active days with logins and the percentage of market-open days with transactions. 
# The following lines are used to calculate these values

library(zoo)

# Defining the active period for each account as the number of days from the first login or transaction to the last login

active_period<-data.and.login.date.m[,.(min(port_date, last_td, pur_day, na.rm=T), max(port_date)), by=.(anon)][, num_days:=as.numeric(V2-V1+1)]

data.and.login.date.m[, min_date_account1:=min(as.numeric(port_date), as.numeric(last_td), as.numeric(pur_day), na.rm=T), by=.(anon)]
data.and.login.date.m[,min_date_account:=as.Date.numeric(min_date_account1)]
data.and.login.date.m[, min_date_account1:=NULL]  

# Counting the total number of transactions

tr0<-unique(data.and.login.date.m[(narrative=="BUY" | narrative=="SELL"), 
                                  .(anon, last_td)])
tr0[, tr:=last_td]
tr1<- unique(data.and.login.date.m[!is.na(pur_day),.(anon, pur_day)])[,tr:=pur_day]
tr0_1<-unique(rbind(tr0[,.(anon,tr)], tr1[,.(anon, tr)]))[,.N, by=.(anon)] 
tr0_1

tr_sum<-tr0_1[, sum_tr:=N] 
tr_sum[,N:=NULL]

# Login counts 

login_sum<-unique(data.and.login.date.m[a_login==1,.(anon, port_date)])[,.N, by=.(anon)]

tr_login_sum<-merge(tr_sum,login_sum, by=c("anon"))[,.(anon, sum_tr, N)]
setnames(tr_login_sum, "N", "sum_login")
sum_up(tr_login_sum, d=T)

# Calculating the count of logins and transactions as a proportion of the total number of days the account was active

active_period<- merge(active_period, tr_login_sum, by=c("anon"), all.x=T)

# For transactions, we're interested in the count relative to the total number of business market open days, not just any day.
# Thus, we'll now divide the count by the number of business days

library(bizdays)

create.calendar(name='WeekendsOnly', weekdays=c('sunday', 'saturday'))

from_dates <- '2013-01-01'
to_dates <- seq(as.Date('2013-12-31'), as.Date('2020-12-31'), by='years')
bizdays(from_dates, to_dates, 'WeekendsOnly')

active_period[, num_busines_days:= bizdays(V1, V2, 'WeekendsOnly')+1]
active_period[, ":=" (months_w_logins100=(sum_login/num_days)*100, 
                      months_w_tr100=(sum_tr/num_busines_days)*100)] 

# Flagging missing purchase prices due to transfers from external accounts

data.and.login.date.m[, missing_pur.price:=0]
data.and.login.date.m[is.na(qwapp), missing_pur.price:=1]

# Counting the number of stocks held on any given day, targeting days when the investor held a minimum of 2 stocks

data.and.login.date.m[, Count_sedols := NULL]
data.and.login.date.m[missing_pur.price==0, `:=`( Count_sedols = .N ) , by = .(anon, port_date) ]
data.and.login.date.m[, Count_sedols2:=max(Count_sedols, na.rm=T), by=.(anon, port_date)]
data.and.login.date.m[, Count_sedols:=Count_sedols2]
data.and.login.date.m[, Count_sedols2:=NULL]
data.and.login.date.m[is.na(Count_sedols), Count_sedols:=0]

# Removing stocks with unknown purchase prices and retaining portfolios with at least two stocks

data.and.login.date.m <- data.and.login.date.m[a_login==1 & Count_sedols>1 & missing_pur.price==0]


#////////////////////////////////////////////////////////////////////////////////////
#
# Creating account-level variables to be used for the summary statistics table in the paper.
# These will be incorporated into the 'for_stat_acc' data.table later in the code.
#
#////////////////////////////////////////////////////////////////////////////////////

# Portfolio value 

data.and.login.date.m[,   stock_value:=net.posit* app]
data.and.login.date.m[, portfolio_value_all_stocks:=sum(stock_value, na.rm=T),  by=.(anon, port_date)]

# Identifying which stocks are mutual funds. We have a dataset from Datastream containing this information, which we will merge with our data

static_inductry<-fread("D:/files_moved_19/output datastream/datastream data/260118 isins sedols recovered static and industry.csv")
static_inductry<-unique(static_inductry[!is.na(code_used_ds),.(code_used_ds , INDUSTRY_GROUP2_NEW)])
static_inductry<-static_inductry[INDUSTRY_GROUP2_NEW=="Mutual Funds"]
setnames(static_inductry, "code_used_ds", "Code_used_DS")

data.and.login.date.m <- merge(data.and.login.date.m,static_inductry, by=c("Code_used_DS"),  all.x=TRUE)
data.and.login.date.m[INDUSTRY_GROUP2_NEW=="Mutual Funds", MF_value:=sum(net.posit*app, na.rm=T),  by=.(anon, port_date)]
data.and.login.date.m[,MF_value:=mean(MF_value, na.rm=T),  by=.(anon, port_date)]
data.and.login.date.m[is.na(MF_value), MF_value:=0] 

# Calculating the portfolio value, the number of stocks, and the percentage of the portfolio invested in mutual funds at the beginning of each month

for_stat<- unique(data.and.login.date.m[,.(anon, port_date, portfolio_value_all_stocks, MF_value, Count_sedols)])
for_stat[, per_MF:=MF_value/portfolio_value_all_stocks]
for_stat[is.na(per_MF), per_MF:=0] 

for_stat<- for_stat[order(anon, port_date)]
for_stat[, Month_Yr := format(as.Date(port_date), "%Y-%m") ]
for_stat_monthly<-for_stat[,head(.SD,1), by=.(anon, Month_Yr)]
for_stat_monthly<-for_stat_monthly[, .(port_val10000=mean(portfolio_value_all_stocks)/10000, 
                                       MF_val10000=mean(MF_value)/10000, 
                                       number_stock=mean(Count_sedols), 
                                       per_MF=mean(per_MF)),
                                   by=.(anon)]

for_stat_acc<- merge(active_period, for_stat_monthly, by=c("anon"))

# Retrieving demographic information for each investor, which includes age, gender, and account tenure

master <-fread("C:/Barclays/smart_investor/data/anonymous_customer_level_data_merged_master.csv") 

master<- master[!is.na(anon_portfolio_id), .(anon_portfolio_id,  gender ,  dob, portfolio_open_date, portfolio_type)]
master<- master[anon_portfolio_id %in% for_stat_acc$anon]
master[, portfolio_open_date:=as.Date(portfolio_open_date , "%Y-%m-%d")]

master[,age:=2017-(as.numeric(dob))]
master[age>=87,age:=87]

# Calculating account tenure

master_open_date<-master[,.(anon_portfolio_id, portfolio_open_date, gender, age)]
setnames(master_open_date,"anon_portfolio_id", "anon")
data.and.login.date.m <-merge(data.and.login.date.m, master_open_date, by=c("anon"), all.x=T)
data.and.login.date.m[portfolio_open_date>= min_date_account, adjusted_open_date:=(min_date_account-1)]
data.and.login.date.m[portfolio_open_date< min_date_account, adjusted_open_date:=portfolio_open_date]
data.and.login.date.m[is.na(portfolio_open_date), adjusted_open_date:=(min_date_account-1)]

data.and.login.date.m[, account_tenure_years_to2016:=as.numeric((as.Date("2016-03-29" , "%Y-%m-%d")-adjusted_open_date)/365)]

open_date_account_dt<-unique(data.and.login.date.m[,.(anon,  adjusted_open_date)])
setnames(master,"anon_portfolio_id", "anon")

master<- merge(master, open_date_account_dt, by=c("anon"))
master[, account_tenure_years_to2016:=as.numeric((as.Date("2016-03-29" , "%Y-%m-%d")-adjusted_open_date)/365)]

data.and.login.date.m[,age_label:=cut(age, seq(20,80,10), right=TRUE)]
master[,age_label:=cut(age, seq(20,80,10), right=TRUE)]

for_stat_acc<- merge(for_stat_acc, master, by=c("anon"), all.x=T)

# Dummy for female investors

for_stat_acc[, female:=ifelse(gender=="F", 1, 0)]

# Including accounts with positive portfolio value 

for_stat_acc<- for_stat_acc[port_val10000>0]

# Excluding accounts with missing demographic data

for_stat_acc<- for_stat_acc[!is.na(age) & !is.na(months_w_tr100)]
acc_with_demo<-unique(for_stat_acc$anon)
data.and.login.date.m<- data.and.login.date.m[anon %in% acc_with_demo]


# Removing days in which the stocks were purchased (days when stocks started a positive position) since we are interested in analysing trades for
# stocks that have been held at least one day in the portfolio

data.and.login.date.m<-data.and.login.date.m[positions_no_opened_on_sell_day==1] 

acc_with_final<-unique(data.and.login.date.m$anon)
for_stat_acc<- for_stat_acc[anon %in% acc_with_final]

# Removing accounts with no remaining selling days

acc_with_sells<-data.and.login.date.m[sell==1,.N, by=.(anon)]$anon
data.and.login.date.m<- data.and.login.date.m[anon %in% acc_with_sells]

data.and.login.date.m_only_sell_days<- data.and.login.date.m[num_sales_the_day>0]
include_anon<-unique(data.and.login.date.m_only_sell_days$anon)
data.and.login.date.m<- data.and.login.date.m[anon %in% include_anon] 

for_stat_acc_clean<- for_stat_acc[anon %in% include_anon] 
for_stat_acc<- unique(for_stat_acc_clean)

# Computing returns

data.and.login.date.m[,return.since.pur:=(app-qwapp)/qwapp]

# Generating dummy variables for gains and losses

data.and.login.date.m[,gain.since.pur:=ifelse(return.since.pur>0, 1, 0)]
data.and.login.date.m[,loss.since.pur:=ifelse(return.since.pur<0, 1, 0)] 

data.and.login.date.m[,return.since.pur100:=return.since.pur*100]



# Flagging stock purchases or top-ups, as these can change the weighted purchase price.

buys_happenidng<-data.and.login.date.m[,.(buy,anon, Code_used_DS, port_date, pur_day, qwapp,
                                          qwapp_bef, last_td, app, 
                                          quantity, narrative)]
buys_happenidng[qwapp!=qwapp_bef, buy_happened:=1]
buys_happenidng[qwapp!=qwapp_bef, buy_happened_day:=last_td]
buys_happenidng[qwapp!=qwapp_bef,new_purprice:=qwapp]

buys_happenidng[is.na(qwapp_bef) & !is.na(qwapp) & last_td==pur_day, buy_happened:=1]
buys_happenidng[is.na(qwapp_bef) & !is.na(qwapp) & last_td==pur_day, buy_happened_day:=last_td]
buys_happenidng[is.na(qwapp_bef) & !is.na(qwapp) & last_td==pur_day,new_purprice:=qwapp]

buys_happenidng<- buys_happenidng[!is.na(buy_happened_day)]

adding_buys<-unique(buys_happenidng[narrative=="BUY",.(anon, Code_used_DS, last_td, narrative, qwapp)])
setnames(adding_buys, "qwapp", "new_purprice2")
setnames(adding_buys, "last_td", "buy_happened_day")
adding_buys[,adding_buys:=1]
adding_buys[,narrative:=NULL]

buys_happenidng2<-merge(buys_happenidng, adding_buys, by=c("anon", "Code_used_DS", "buy_happened_day" ), all=T)

# new_purprice is the purchase price for a new stock purchase
# new_purprice2 is the purchase price following top-ups

buys_happenidng2[(new_purprice-new_purprice2)>0.0001] 
buys_happenidng2[adding_buys==1 & is.na(buy_happened), buy_happened:=1]

buys_happenidng2<- buys_happenidng2[ buy_happened==1]
buys_happenidng2[is.na(new_purprice), new_purprice:=new_purprice2]
buys_happenidng2<-  unique(buys_happenidng2[,.(anon,Code_used_DS, buy_happened_day, new_purprice)])

# Deleting duplicates  

buys_happenidng2[,order:=seq_along(new_purprice), by=.(anon, Code_used_DS, buy_happened_day)]
buys_happenidng2<-buys_happenidng2[order==1]

# buys_happenidng2 includes the purchase price for buy days, including purchase days.

list_anon<-unique(data.and.login.date.m[,.(anon)])


#////////////////////////////////////////////////////////////////////////////////////
#
# Defining peak prices
# --------------------
# We consider the highest price the investor has experienced with the stock since its purchase.
# The peak price is the highest historical value since the stock was acquired. Typically, we use the market price to define this peak.
# However, if an investor tops up and pays more than the market closing price, we use the price they paid to calculate the peak.
# The peak price is the highest price the investor has experienced since purchase, with the minimum value being the purchase price.

# In the paper, we also calculate a second type of peak price: peaks that occurred in the past year.
#
#////////////////////////////////////////////////////////////////////////////////////

# The following function `finding_peak` identifies the peak price within the past year or any specified window of days 
# (considering only business days).
# In this function, `update_days` specifies the number of days a peak must remain the highest 
# value for investors to consider it a valid peak.
# In the paper, we use one week and one month as examples, but any value can be used.
# Setting `show_merged=YES` will display observations including all login dates; otherwise, 
# it will include only days when the market was open.  
  

finding_peak<- function(number_anon, show_merged, update_days, window_days) {
  
  anon_used=list_anon[number_anon,]$anon 
  stocks<- unique(data.and.login.date.m[anon==anon_used]$Code_used_DS)
  update<-unique(buys_happenidng2[anon==anon_used,.(Code_used_DS, buy_happened_day, new_purprice)])
  setnames(update,"buy_happened_day", "date" )
  dd_for_update_anon<- dd[Code_used_DS %in% stocks][,.(app,   date, Code_used_DS)]
  
  dd_for_update_anon<- merge(dd_for_update_anon, update, by=c("Code_used_DS", "date"), all.x=T)
  dd_for_update_anon[, app_for_peak:=ifelse( app>= new_purprice | is.na(new_purprice), app, new_purprice)]
  dd_for_update_anon[,app:=NULL]
  dd_for_update_anon[,new_purprice:=NULL]
  
  setnames(dd_for_update_anon, "app_for_peak", "app")
  
  # potential peaks
  dd_for_update_anon[, app_yest:=shift(app,1,NA,"lag") , by=Code_used_DS]
  dd_for_update_anon[, app_tmr:=shift(app,1,NA,"lead") ,by=.(Code_used_DS)] 
  dd_for_update_anon[, is_peak:=0]
  dd_for_update_anon[app_yest<app & app_tmr<app,  is_peak:=1]
  dd_for_update_anon[is_peak==1,MaxPrice_pot:=app] 
  
  # We use rollmax to get the highest price within a given window.
  # However, we also created an additional variable (MaxPrice_pot_index) to help identify the date of the peak, not just the peak value.
  # To achieve this, we created an index (MaxPrice_pot_index) where each value is calculated as (price * 100000) + a small value (index time / 100000).
  # By applying rollmax to both the price and this index, we can recover the exact date of the peak.
  

  dd_for_update_anon[,index_time:=seq_along(date), by=.(Code_used_DS)]
  dd_for_update_anon[,app100000:=app*100000]
  dd_for_update_anon[, app100000_2:=app100000 + index_time / 100000]
  dd_for_update_anon[, indextrial:=round((app100000_2- app100000)*100000)]
  dd_for_update_anon[is_peak==1,MaxPrice_pot_index:=app100000_2] 
  dd_for_update_anon[is.na(MaxPrice_pot), MaxPrice_pot:=0] 
  dd_for_update_anon[is.na(MaxPrice_pot_index), MaxPrice_pot_index:=0]
  
  window= window_days 
  dd_for_update_anon[, max_window:=as.numeric(rollmax(MaxPrice_pot, window, 
                                                      fill = NA, align = "right")), by=.(Code_used_DS)]
  
  dd_for_update_anon[, max_windowindex:=as.numeric(rollmax(MaxPrice_pot_index, window, 
                                                           fill = NA, align = "right")), by=.(Code_used_DS)]
  
  dd_for_update_anon[, indextrialMax:=round((max_windowindex- max_window*100000)*100000)] 
  
  dd_for_update_anon[,max_windowindex:=NULL]
  dd_for_update_anon[,MaxPrice_pot_index:=NULL]
  dd_for_update_anon[,indextrial:=NULL]
  dd_for_update_anon[,app100000:=NULL]
  dd_for_update_anon[,app100000_2:=NULL]
  
  dd_for_update_anon[max_window==0, max_window:=NA]
  dd_for_update_anon[indextrialMax==0, indextrialMax:=NA]

  dd_for_update_anon<-dd_for_update_anon[order(Code_used_DS, date)]

  # When there is no peak in the past window days, we select the closest peak value.
  cols <- c( "indextrialMax")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  cols <- c( "max_window")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  dd_for_update_anon[ , max_window_lag:=shift(max_window,1,NA,"lag") ,by=.(Code_used_DS)] 
  dd_for_update_anon[ , indextrialMax_lag:=shift(indextrialMax,1,NA,"lag") ,by=.(Code_used_DS)] 
  
  # We currently have the peak value up to today, but we need the peak value as of yesterday, excluding today's price from being considered a peak.

  setnames(dd_for_update_anon,"max_window_lag", "app_past_peak")
  setnames(dd_for_update_anon,"indextrialMax_lag", "index_past_peak")
  
  dd_for_update_anon[, max_window:=NULL]
  dd_for_update_anon[,indextrialMax:=NULL]
  dd_for_update_anon[,MaxPrice_pot:=NULL]
  dd_for_update_anon[,is_peak:=NULL]

  dates_index<-unique(dd_for_update_anon[,.(index_time,date,Code_used_DS)])
  setnames(dates_index, "date", "date_past_peak")
  setnames(dates_index, "index_time", "index_past_peak")
  
  dd_for_update_anon<- merge(dd_for_update_anon, dates_index, by=c("Code_used_DS", "index_past_peak"), all.x=T)
  
  dd_for_update_anon[, distance_to_peak:=as.numeric(date-date_past_peak)]

  dd_for_update_anon[,pass:=0]
  dd_for_update_anon[app>app_past_peak, pass:=1]
  dd_for_update_anon[is.na(app_past_peak), pass:=NA]
  
  setnames(dd_for_update_anon, "date", "port_date")
  setnames(dd_for_update_anon, "app", "appDS")
  setnames(dd_for_update_anon, "pass", "pass_peak")
  
  # update_days indicates how many days a peak must remain the highest value for investors to consider it a true peak.
  # In the paper, we use one week and one month, but any value can be input.
  
  dd_for_update_anon[, app_yest:=NULL]
  dd_for_update_anon[, app_tmr:=NULL]
  
  dd_for_update_anon[, app_past_peak_rest:=NULL]
  dd_for_update_anon[, date_past_peak_rest:=NULL]
  
  dd_for_update_anon[, app_past_peak_Nlag:=shift(app_past_peak,update_days,NA,"lag") ,by=.(Code_used_DS)] 
  dd_for_update_anon[app_past_peak==app_past_peak_Nlag, app_past_peak_rest:=app_past_peak]
  
  dd_for_update_anon[ , date_past_peak_Nlag:=shift(date_past_peak,update_days,NA,"lag") ,by=.(Code_used_DS)] 
  dd_for_update_anon[date_past_peak==date_past_peak_Nlag, date_past_peak_rest:=date_past_peak]
  
  dd_for_update_anon[is.na(date_past_peak_rest), app_past_peak_rest:=NA]
  
  cols <- c( "app_past_peak_rest")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  cols <- c( "date_past_peak_rest")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  dd_for_update_anon[,app_past_peak_Nlag:=NULL]
  dd_for_update_anon[,date_past_peak_Nlag:=NULL]
  dd_for_update_anon[,index_past_peak :=NULL] 
  dd_for_update_anon[,index_time:=NULL]
  dd_for_update_anon[,pass_peak_rest:=0]
  dd_for_update_anon[appDS>app_past_peak_rest, pass_peak_rest:=1]
  dd_for_update_anon[is.na(app_past_peak_rest), pass_peak_rest:=NA]
  

  anon_period<-unique(data.and.login.date.m[anon==anon_used,.(anon, Code_used_DS, port_date)])
  
  anon_period<-merge(anon_period, dd_for_update_anon, by=c("Code_used_DS",  "port_date"), all.x=T)
  
  print(number_anon)
  
  if (show_merged=="YES"){ # observations include all login dates
    anon_period
  } else{
    dd_for_update_anon[, anon:=anon_used] # observations include all market opend days
    dd_for_update_anon
  }
}





#####################################
# Running Peak Price Analysis Across All Accounts
# -----------------------------------------------
# We are executing the `finding_peak` function across all accounts in the dataset.
# First, this analysis uses a one-year window to identify peak prices with an `update_days` value of 5,
# which corresponds to a week of business days. 
# The function determines the highest price each investor experienced within the past year,
# updating the peak price if a new high is sustained for at least 5 business days.
# The output is stored in `peak_past_year_5update.csv`.
#
# Next, we rerun the function with an `update_days` value of 20 (corresponding to approximately one month).
# The output for this run is stored in `peak_past_year_20update.csv`.
#
#####################################

{
  list_peak_anon<- lapply(1:nrow(list_anon), function(x)   {finding_peak(number_anon = x , show_merged = "YES", 
                                                                         update_days = 5, window_days = 262 )   }  )
  df.list <- setDT(rbindlist(list_peak_anon, fill=TRUE)  )
  
  df.list<-unique(df.list)
  df.list[,order:=seq_along(appDS), by=.(anon, Code_used_DS, port_date)]
  df.list<-df.list[order==1]
  df.list[,order:=NULL]
  
  setnames(df.list, "appDS","appDS_up5_year")
  setnames(df.list, "app_past_peak", "app_past_peak_NOup5_year") 
  setnames(df.list, "date_past_peak", "date_past_peak_NOup5_year")
  setnames(df.list, "distance_to_peak", "distance_to_peak_NOup5_year")
  setnames(df.list, "pass_peak", "pass_peak_NOup5_year")
  setnames(df.list, "app_past_peak_rest", "app_past_peak_up5_year")
  setnames(df.list, "date_past_peak_rest", "date_past_peak_up5_year")
  setnames(df.list, "pass_peak_rest", "pass_peak_up5_year")
  
  # Relevant Variables:
  # -------------------
  # app_past_peak_up5_year:    The peak price within the past year that remained the highest for a minimum of 5 days.
  # date_past_peak_up5_year:   The date when the `app_past_peak_up5_year` peak price was recorded.
  # pass_peak_up5_year:        A dummy variable equal to one if the current price has surpassed the `app_past_peak_up5_year` peak price.


  name_file_use<- paste0("peak_past_year_5update.csv")
  
  output_file <- file.path(wd_peak, name_file_use)           
  
  fwrite(df.list , file=output_file, sep=',', row.names=F, col.names=T)
  

  df.list<- NULL
  list_peak_anon<- NULL
  
  list_peak_anon<- lapply(1:nrow(list_anon), function(x)   {finding_peak(number_anon = x , show_merged = "YES", 
                                                                         update_days = 20, window_days = 262 )   }  )
  df.list <- setDT(rbindlist(list_peak_anon, fill=TRUE)  )
  
  
  df.list<-unique(df.list)
  df.list[,order:=seq_along(appDS), by=.(anon, Code_used_DS, port_date)]
  df.list<-df.list[order==1]
  df.list[,order:=NULL]

  setnames(df.list, "appDS","appDS_up20_year")
  setnames(df.list, "app_past_peak", "app_past_peak_NOup20_year") 
  setnames(df.list, "date_past_peak", "date_past_peak_NOup20_year")
  setnames(df.list, "distance_to_peak", "distance_to_peak_NOup20_year")
  setnames(df.list, "pass_peak", "pass_peak_NOup20_year")
  setnames(df.list, "app_past_peak_rest", "app_past_peak_up20_year")
  setnames(df.list, "date_past_peak_rest", "date_past_peak_up20_year")
  setnames(df.list, "pass_peak_rest", "pass_peak_up20_year")
  
  # Relevant Variables:
  # -------------------
  # app_past_peak_up20_year:   The peak price within the past year that remained the highest for a minimum of 20 days.
  # date_past_peak_up20_year:  The date when the `app_past_peak_up20_year` peak price was recorded.
  # pass_peak_up20_year:       A dummy variable equal to one if the current price has surpassed the `app_past_peak_up20_year` peak price.
  
  
  name_file_use<- paste0("peak_past_year_20update.csv")
  
  output_file <- file.path(wd_peak, name_file_use)           
  
  fwrite(df.list , file=output_file, sep=',', row.names=F, col.names=T)
}





# The following function, `finding_peak_MAX`, identifies the peak price since the investor purchased the stock, 
# rather than within a specified window.
# In this function, `update_days` specifies the number of days a peak must remain the highest value 
# for investors to consider it a valid peak.



finding_peak_MAX<- function(number_anon, show_merged ,update_days) {
  anon_used=list_anon[number_anon,]$anon 
  stocks<- unique(data.and.login.date.m[anon==anon_used]$Code_used_DS)
  update<-unique(buys_happenidng2[anon==anon_used,.(Code_used_DS, buy_happened_day, new_purprice)])
  setnames(update,"buy_happened_day", "date" )
  
  dd_for_update_anon<- dd[Code_used_DS %in% stocks][,.(app,   date, Code_used_DS)]
  
  dd_for_update_anon<- merge(dd_for_update_anon, update, by=c("Code_used_DS", "date"), all.x=T)
  dd_for_update_anon[, app_for_peak:=ifelse( app>= new_purprice | is.na(new_purprice), app, new_purprice)]
  dd_for_update_anon[,app:=NULL]
  dd_for_update_anon[,new_purprice:=NULL]
  
  setnames(dd_for_update_anon, "app_for_peak", "app")
  dd_for_update_anon[,anon:=anon_used]
  dd_for_update_anon[,from_DD:=1]
  
  purdayslist<- unique(data.and.login.date.m[anon==anon_used, .(Code_used_DS, port_date, pur_day, net.posit)]) 
  purdayslist2<- unique(purdayslist[,.(Code_used_DS, pur_day)]) 
  purdayslist2[, port_date:=pur_day]  
  
  purdayslist<-merge(purdayslist, purdayslist2, by=c("Code_used_DS", "port_date"), all=T)
  purdayslist[, from_login:=1]
  purdayslist[is.na(pur_day.x), pur_day.x:=pur_day.y]
  purdayslist[, pur_day.y:=NULL]
  
  dd_for_update_anon<- dd_for_update_anon[,.(date, Code_used_DS, anon, app, from_DD)]
  setnames(dd_for_update_anon, "date", "port_date")
  
  dd_for_update_anon<- merge(dd_for_update_anon, purdayslist, by=c("Code_used_DS", "port_date"), all.x=T)
  
  dd_for_update_anon[!is.na(pur_day.x), max_date:=max(port_date), by=.(Code_used_DS, pur_day.x)]
  
  dd_for_update_anon<- dd_for_update_anon[order(Code_used_DS, port_date)]

  cols <- c( "pur_day.x")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  cols <- c( "max_date")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=Code_used_DS,  .SDcols = cols]) 
  
  dd_for_update_anon<-dd_for_update_anon[port_date >= pur_day.x & port_date<= max_date]
  
  dd_for_update_anon<- dd_for_update_anon[,.(Code_used_DS , port_date,  anon,      app, pur_day.x)]
  
  # We want to identify the maximum price, considering all prices from the purchase date up until the price from yesterday.

  dd_for_update_anon[ , app_yest:=(shift(as.character(app),1,NA,"lag")) ,by=.(Code_used_DS, pur_day.x)] 
  
  dd_for_update_anon[ , port_date_yest:=(shift(as.character( port_date),1,NA,"lag")) ,by=.(Code_used_DS, pur_day.x)] 
  
  dd_for_update_anon[!is.na(app_yest),MaxPriceSoFar:=cummax(app_yest),by=list(Code_used_DS, pur_day.x)]
  
  # If yesterday's price is the current maximum and today's price exceeds yesterday's, 
  # then yesterday's price is not the true maximum.

  dd_for_update_anon[MaxPriceSoFar==app_yest & app>= MaxPriceSoFar, MaxPriceSoFar:=NA]
  
  cols <- c( "MaxPriceSoFar")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=.(Code_used_DS, pur_day.x),  .SDcols = cols]) 
  
  
  dd_for_update_anon[, order:=seq_along(port_date), by=.(Code_used_DS, MaxPriceSoFar, pur_day.x )]
  
  dd_for_update_anon[MaxPriceSoFar==app_yest & order>1]
  
  dd_for_update_anon[MaxPriceSoFar==app_yest & order==1, date_app_peak:=port_date_yest]
  
  
  cols <- c( "date_app_peak")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=.(Code_used_DS, MaxPriceSoFar, pur_day.x),  .SDcols = cols]) 
  
  
  dd_for_update_anon[,port_date_yest:=NULL]
  dd_for_update_anon[,date_app_peak:=as.Date(date_app_peak, "%Y-%m-%d")]
  setnames(dd_for_update_anon, "date_app_peak", "date_past_peak")
  setnames(dd_for_update_anon, "MaxPriceSoFar", "app_past_peak")
  
  # distance to past peak
  
  dd_for_update_anon[, distance_to_peak:=as.numeric(port_date-date_past_peak)]

  dd_for_update_anon[,pass:=0]
  dd_for_update_anon[app>app_past_peak, pass:=1]
  dd_for_update_anon[is.na(app_past_peak), pass:=NA]
  
  setnames(dd_for_update_anon, "app", "appDS")
  setnames(dd_for_update_anon, "pass", "pass_peak")
  
  dd_for_update_anon[ , app_past_peak_Nlag:=shift(app_past_peak,update_days,NA,"lag") ,by=.(Code_used_DS, pur_day.x)] 
  dd_for_update_anon[app_past_peak==app_past_peak_Nlag, app_past_peak_rest:=app_past_peak]
  
  dd_for_update_anon[ , date_past_peak_Nlag:=shift(date_past_peak,update_days,NA,"lag") ,by=.(Code_used_DS, pur_day.x)] 
  dd_for_update_anon[date_past_peak==date_past_peak_Nlag, date_past_peak_rest:=date_past_peak]
  dd_for_update_anon[is.na(date_past_peak_rest), app_past_peak_rest:=NA]
  
  cols <- c( "app_past_peak_rest")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=.(Code_used_DS, pur_day.x),  .SDcols = cols]) 
  
  cols <- c( "date_past_peak_rest")
  (dd_for_update_anon[, (cols) := na.locf(.SD, na.rm = F), by=.(Code_used_DS, pur_day.x),  .SDcols = cols]) 
  
  dd_for_update_anon[,app_past_peak_Nlag:=NULL]
  dd_for_update_anon[,date_past_peak_Nlag:=NULL]
  dd_for_update_anon[,pass_peak_rest:=0]
  dd_for_update_anon[appDS>app_past_peak_rest, pass_peak_rest:=1]
  dd_for_update_anon[is.na(app_past_peak_rest), pass_peak_rest:=NA]
  dd_for_update_anon[,app_yest:=NULL]

  anon_period<-unique(data.and.login.date.m[anon==anon_used,.(anon, Code_used_DS, port_date)])
  
  anon_period<-merge(anon_period, dd_for_update_anon, by=c("Code_used_DS",  "port_date"), all.x=T)
  
  print(number_anon)
  
  if (show_merged=="YES"){
    anon_period
  } else{
    dd_for_update_anon
  }
}


#####################################
# -----------------------------------------------
# Executing the `finding_peak_MAX` Function Across All Accounts
# ------------------------------------------------------------
# We are executing the `finding_peak_MAX` function across all accounts in the dataset.
# First, this analysis uses an `update_days` value of 5, corresponding to a week of business days.
# The function determines the highest price each investor has experienced since the purchase of the stock,
# updating the peak price if a new high is sustained for at least 5 business days.
# The output is stored in `MAXC_peak_5update.csv`.
#
# Next, we rerun the function with an `update_days` value of 20, corresponding to approximately one month.
# The output for this run is stored in `MAXC_peak_20update.csv`.
#
#####################################


{
  
  list_peak_anonMAX<- lapply(1:nrow(list_anon), function(x)   {finding_peak_MAX(number_anon = x , show_merged = "YES", update_days = 5)   }  )

  df.listMAX <- setDT(rbindlist(list_peak_anonMAX, fill=TRUE)  )
  df.listMAX<-unique(df.listMAX)
  df.listMAX[,anon.y:=NULL]
  setnames(df.listMAX, "anon.x", "anon")
  df.listMAX[,order:=seq_along(appDS), by=.(anon, Code_used_DS, port_date)]
  df.listMAX<-df.listMAX[order==1]

  df.listMAX[,order:=NULL]

  setnames(df.listMAX, "appDS","appDS_MAXC_up5")
  setnames(df.listMAX,"pur_day.x", "pur_day_MAXC_up5")
  setnames(df.listMAX, "app_past_peak", "app_past_peak_MAXC_NOup5") 
  setnames(df.listMAX, "date_past_peak", "date_past_peak_MAXC_NOup5")
  setnames(df.listMAX, "distance_to_peak", "distance_to_peak_MAXC_NOup5")
  setnames(df.listMAX, "pass_peak", "pass_peak_MAXC_NOup5")
  setnames(df.listMAX, "app_past_peak_rest", "app_past_peak_MAXC_up5")
  setnames(df.listMAX, "date_past_peak_rest", "date_past_peak_MAXC_up5")
  setnames(df.listMAX, "pass_peak_rest", "pass_peak_MAXC_up5")
  
  name_file_use<- paste0("MAXC_peak_5update.csv")
  
  output_file <- file.path(wd_peak, name_file_use)           
  
  fwrite(df.listMAX , file=output_file, sep=',', row.names=F, col.names=T)
  
  
  # Relevant Variables:
  # -------------------
  # app_past_peak_MAXC_up5:    The peak price since purchase that remained the highest for a minimum of 5 days.
  # date_past_peak_MAXC_up5:   The date when the `app_past_peak_MAXC_up5` peak price was recorded.
  # pass_peak_MAXC_up5:        A dummy variable equal to one if the current price has surpassed the `app_past_peak_MAXC_up5` peak price.

  
  list_peak_anonMAX<- lapply(1:nrow(list_anon), function(x)   {finding_peak_MAX(number_anon = x , show_merged = "YES", update_days = 20)   }  )
  df.listMAX <- setDT(rbindlist(list_peak_anonMAX, fill=TRUE)  )

  df.listMAX<-unique(df.listMAX)
  df.listMAX[,anon.y:=NULL]
  setnames(df.listMAX, "anon.x", "anon")
  df.listMAX[,order:=seq_along(appDS), by=.(anon, Code_used_DS, port_date)]
  df.listMAX<-df.listMAX[order==1]
  df.listMAX[,order:=NULL]
  
  setnames(df.listMAX, "appDS","appDS_MAXC_up20")
  setnames(df.listMAX,"pur_day.x", "pur_day_MAXC_up20")
  setnames(df.listMAX, "app_past_peak", "app_past_peak_MAXC_NOup20") 
  setnames(df.listMAX, "date_past_peak", "date_past_peak_MAXC_NOup20")
  setnames(df.listMAX, "distance_to_peak", "distance_to_peak_MAXC_NOup20")
  setnames(df.listMAX, "pass_peak", "pass_peak_MAXC_NOup20")
  setnames(df.listMAX, "app_past_peak_rest", "app_past_peak_MAXC_up20")
  setnames(df.listMAX, "date_past_peak_rest", "date_past_peak_MAXC_up20")
  setnames(df.listMAX, "pass_peak_rest", "pass_peak_MAXC_up20")
  
  # Relevant Variables:
  # -------------------
  # app_past_peak_MAXC_up20:   The peak price since purchase that remained the highest for a minimum of 20 days (approximately one month).
  # date_past_peak_MAXC_up20:  The date when the `app_past_peak_MAXC_up20` peak price was recorded.
  # pass_peak_MAXC_up20:       A dummy variable equal to one if the current price has surpassed the `app_past_peak_MAXC_up20` peak price.
  
  
  
  name_file_use<- paste0("MAXC_peak_20update.csv")
  
  output_file <- file.path(wd_peak, name_file_use)           
  
  fwrite(df.listMAX , file=output_file, sep=',', row.names=F, col.names=T)
}


##############################
# Saving the cleaned data where each observation represents an account, stock, and date 
# in `data_peak_prices_cleaned.csv`.
#
# Saving the cross-sectional data where each observation represents an account 
# in `data_peak_cross_sec_acc.csv`.
##############################


name_file_csv<- paste0("data_peak_prices_cleaned.csv")
output_file <- file.path(wd_peak, name_file_csv)            
fwrite(data.and.login.date.m , file=output_file, sep=',', row.names=F, col.names=T)


name_file_csv<- paste0("data_peak_cross_sec_acc.csv")
output_file <- file.path(wd_peak, name_file_csv)
fwrite(for_stat_acc, file=output_file, sep=',', row.names=F, col.names=T)



